1 Cyclicity, variability, duration and timing of symptoms

1.1 Definitions

These are all definitions that holds for a given number of consecutive cycles of a given user. These metrics can be defined as running metric to reflect the evolutions of the symptoms. Here, we will define these metric over 6 cycles.

1.1.1 Duration

The duration is defined as the average number of days per cycle in which the symptom is reported.

\[ d = \frac{N}{C} \]

d_wide = matrix(
  c(0,0,1,1,0,0,rep(0,20),
    0,0,0,0,0,0,rep(0,20),
    0,0,1,1,0,0,rep(0,20),
    0,0,0,0,0,0,rep(0,20),
    0,0,1,1,0,0,rep(0,20),
    0,0,0,0,0,0,rep(0,20)), 
  nrow = 6, byrow = TRUE)
cvdt_duration = function(d_wide){
  j = which(apply(d_wide, 1, sum)>0)
  if(length(j)>0){
    duration = sum(d_wide[j,])/length(j)
  }else{duration = 0}
  return(duration)
}

cvdt_duration(d_wide)
## [1] 2

where \(N\) is the number of reported symptoms and \(C\) is the number of cycles.

1.1.2 Timing

The timing is the day in the cycle at which the symptom is reported on average (or the average cycleday at which the symptom are reported).

\[ t = \frac{\sum d^{\tt TB}}{N} \] where \(d\) is the cycleday (from -18 to 7) and \(\tt TB\) is a binary variable whose value is 1 when the symptom is reported and 0 when not.

cvdt_timing = function(d_wide){
  cycledays = matrix(rep(c(-par$D:par$Df), nrow(d_wide)), nrow = nrow(d_wide),byrow = TRUE)
  timing = mean(cycledays[d_wide > 0])
  return(timing)
}

cvdt_timing(d_wide)
## [1] -15.5

1.1.3 Variability

The variability decreases as the standard deviation of the number of symptom per cycle increases.

\[ v = \tt sd(n_c) \] where \(n_c\) is the number of symptoms reported in cycle \(c\).

cvdt_variability = function(d_wide){
  variability = sd(apply(d_wide, 1, sum))
  return(variability)
}

cvdt_variability(d_wide)
## [1] 1.095445

1.1.4 Cyclicity

The cyclicity is the probability that the symptom is reported in a predictable window of time with respect to the end of the menstrual cycle.

find_window = function(average_profile, nd, timing){
  D = length(average_profile)
  d = sort(order(average_profile, decreasing = TRUE)[1:nd]) # we take the days with the highest average profile
  conditions = (abs(mean(c(-par$D:par$Df)[d]) - timing)>(nd/1.5)) | # window not centered on timing
    (length(d)>1) & (sum(diff(d)>1)>1) # too many interuptions in d, we take a continuous window of nd around timing
  if(conditions){ 
    j = round(timing)+par$D+1; j_start = (j - floor(nd/2));j_end = (j+ceiling(nd/2)-1)
    if(j_start < 1){j_start = 1; j_end = nd}; if(j_end > D){j_end = D; j_start = D-nd+1}
    d = j_start:j_end
  }
  db = rep(FALSE,D)
  db[d] = TRUE
  return(db)
}

compute_cyclicity_statistics = function(X,  plot = FALSE){
  j = which(apply(X, 1, sum)>0)
  if(length(j)>1){
    N = sum(X); 
    if(N > 1){
      C = nrow(X); D = ncol(X);
      duration = cvdt_duration(X); timing = cvdt_timing(X)
      # average profile #average_profile = apply(X, 2, mean)
      average_profile = apply(X, 2, sum); average_profile = average_profile/length(j)
      # finding the window
      nd = round(duration+2*(C-duration)/C) # number of days for the window; We give a larger margin for short durations
      if(nd<D){
        d = find_window(average_profile, nd, timing)
        p_d = mean(average_profile[d]); p_o = mean(average_profile[!d])
        stat = (p_d - p_o)/mean(c(p_d,p_d,1))
        if(plot){plot(c(-par$D:par$Df), average_profile, type = "b", ylim = c(0,1), col = d+1, pch = c(1,16)[d+1])}
      }else{stat = 0}
    }else{stat = 0}
  }else{stat = 0}
  return(stat)
}
D = par$D + par$Df +1
par$dict_cyclicity_stat = data.frame()

for(d in 0:D){
  cat(d,"\n")
  C = 100; N = d*C; S = C*D
  statistics = c()
  for(i in 1:200){
    X = matrix(sample(c(rep(1,N),rep(0,S - N))), nrow = C, ncol = D)
    compute_cyclicity_statistics(X)
    statistics = c(statistics,compute_cyclicity_statistics(X))
  }
  par$dict_cyclicity_stat = rbind(par$dict_cyclicity_stat, data.frame(d = d, statistics = statistics))
}
## 0 
## 1 
## 2 
## 3 
## 4 
## 5 
## 6 
## 7 
## 8 
## 9 
## 10 
## 11 
## 12 
## 13 
## 14 
## 15 
## 16 
## 17 
## 18 
## 19 
## 20 
## 21 
## 22 
## 23 
## 24 
## 25 
## 26
plot(aggregate(statistics ~ d, par$dict_cyclicity_stat, median), ylim = c(-0.2,1))
abline(h = 0)

ggplot(par$dict_cyclicity_stat, aes(x = statistics, fill = factor(d))) +
  geom_histogram(binwidth = 0.01)+
  facet_grid(d ~ ., scale = "free_y")+xlim(c(-1,1))
## Warning: Removed 54 rows containing missing values (geom_bar).

ggplot(par$dict_cyclicity_stat, aes(x = statistics, col = factor(d))) +
  geom_density(bw = 0.1)+xlim(c(-1,1))

cvdt_cyclicity = function(d_wide){
  cyclicity = max(0,compute_cyclicity_statistics(d_wide))
  return(cyclicity)
}

cvdt_cyclicity(d_wide)
## [1] 0.8571429
cvdt_cyclicity(X)
## [1] 0

1.2 Implementation

1.2.1 Data Preparation

Loading data

load(paste0(IO$output_data, "users.Rdata"), verbose = TRUE)
## Loading objects:
##   users
load(paste0(IO$output_data, "cycles_m.Rdata"), verbose = TRUE)
## Loading objects:
##   cycles_m
colnames(cycles_m)
##  [1] "cycle_id_m"            "n_obs"                
##  [3] "n_days_obs_lut_D"      "n_days_obs_lut_7"     
##  [5] "n_days_obs_foll_Df"    "n_TB"                 
##  [7] "first_TB"              "last_TB"              
##  [9] "TB_stretch"            "user_id"              
## [11] "cycle_nb_m"            "age"                  
## [13] "country"               "height"               
## [15] "weight"                "BC"                   
## [17] "n_days_obs_D_Df"       "n_days_obs_lut_D_7"   
## [19] "f_D_7"                 "f_7_0"                
## [21] "f_0_7"                 "tracking_cluster_n"   
## [23] "tracking_cluster"      "tracking_group"       
## [25] "cycle_nb_m_rel"        "init_TB_group"        
## [27] "valid_transition"      "cycle_nb_m_from_trans"
head(cycles_m)
##                                    cycle_id_m n_obs n_days_obs_lut_D
## 1 002f1fd24f5f162b50392c08c15ce6d15ee861f8_10    23               16
## 2 002f1fd24f5f162b50392c08c15ce6d15ee861f8_11    25               17
## 3 002f1fd24f5f162b50392c08c15ce6d15ee861f8_12    15               15
## 4 002f1fd24f5f162b50392c08c15ce6d15ee861f8_13    16               14
## 5 002f1fd24f5f162b50392c08c15ce6d15ee861f8_14    12               11
## 6  002f1fd24f5f162b50392c08c15ce6d15ee861f8_2     2                0
##   n_days_obs_lut_7 n_days_obs_foll_Df n_TB first_TB last_TB TB_stretch
## 1                5                  7    0       NA      NA         NA
## 2                6                  8    0       NA      NA         NA
## 3                4                  0    0       NA      NA         NA
## 4                7                  0    0       NA      NA         NA
## 5                5                  0    0       NA      NA         NA
## 6                0                  2    0       NA      NA         NA
##                                    user_id cycle_nb_m age        country
## 1 002f1fd24f5f162b50392c08c15ce6d15ee861f8         10  20 United Kingdom
## 2 002f1fd24f5f162b50392c08c15ce6d15ee861f8         11  20 United Kingdom
## 3 002f1fd24f5f162b50392c08c15ce6d15ee861f8         12  20 United Kingdom
## 4 002f1fd24f5f162b50392c08c15ce6d15ee861f8         13  20 United Kingdom
## 5 002f1fd24f5f162b50392c08c15ce6d15ee861f8         14  20 United Kingdom
## 6 002f1fd24f5f162b50392c08c15ce6d15ee861f8          2  20 United Kingdom
##   height weight   BC n_days_obs_D_Df n_days_obs_lut_D_7     f_D_7
## 1    155     46 pill              23                 11 1.0000000
## 2    155     46 pill              25                 11 1.0000000
## 3    155     46 pill              15                 11 1.0000000
## 4    155     46 pill              14                  7 0.6363636
## 5    155     46 pill              11                  6 0.5454545
## 6    155     46 pill               2                  0 0.0000000
##       f_7_0 f_0_7 tracking_cluster_n   tracking_cluster     tracking_group
## 1 0.7142857 0.875                  1   regular_tracking   regular tracking
## 2 0.8571429 1.000                  1   regular_tracking   regular tracking
## 3 0.5714286 0.000                  1   regular_tracking  sporadic tracking
## 4 1.0000000 0.000                  1   regular_tracking  sporadic tracking
## 5 0.7142857 0.000                  2 menstrual_tracking  sporadic tracking
## 6 0.0000000 0.250                  2 menstrual_tracking menstrual tracking
##   cycle_nb_m_rel init_TB_group valid_transition cycle_nb_m_from_trans
## 1              8         [0,1)          invalid                    NA
## 2              9         [0,1)          invalid                    NA
## 3             10         [0,1)          invalid                    NA
## 4             11         [0,1)          invalid                    NA
## 5             12         [0,1)          invalid                    NA
## 6              0         [0,1)          invalid                    NA

Selecting stretched of at least 6 consecutive cycles of regular tracking with at least 6 TB in these consecutive cycles.

n_consecutive_cycles = 6

o = order(cycles_m$user_id, cycles_m$cycle_nb_m)
cycles_m = cycles_m[o,]


# at least 3 TB in 6 consecutive cycles

running_cumsum = function(x, n = n_consecutive_cycles){
  X = data.frame()
  N = length(x)
  for(i in 1:n){
    if(i <= N){
      X = rbind(X, x[i:(N-i+1)])
    }
  }
  y = apply(X, 2, sum)
  i = min(N,n)
  if(i > 1){y[1:(i-1)] = y[i]}
  return(y)
}

n_TB_running = ave(cycles_m$n_TB, by = cycles_m$user_id, FUN = running_cumsum)

valid = ((cycles_m$tracking_group == "regular tracking") & (n_TB_running >= 6) & (cycles_m$BC %in% par$BC_dict$name))
valid[is.na(valid)] = FALSE


valid_stretches = function(x, n = n_consecutive_cycles){
  r = rle(x)
  vs = rep((r$lengths>= n) & (r$values), r$lengths)
  return(vs)
}

stretches_number = function(x){
  r = rle(x)
  ns = rep(1:length(r$lengths), r$lengths )
  return(ns)
}


stretches_num = ave(valid, by = cycles_m$user_id, FUN = stretches_number)
stretches = ave(valid, by = cycles_m$user_id, FUN = valid_stretches)
sum(stretches)
## [1] 160861
cycle_ids = cycles_m$cycle_id_m[which(stretches)]
user_ids = unique(cycles_m$user_id[which(stretches)])


cycles_m$stretch_nb = stretches_num * stretches

cycles_m$stretch_id = paste0(cycles_m$user_id, "_s", cycles_m$stretch_nb)

Collect tracking data for these cycles

batches = unique(users$batch[which(users$user_id %in% user_ids)])
days_selected_cycles = data.frame()
tic()
for(b in unique(batches)){ # unique(c(190,batches[1:10]))
  cat(b, "\n")
  load(paste0(IO$output_data, "days/days_",b,".Rdata"), verbose = TRUE)
  j = which((days$cycle_id_m %in% cycle_ids) & (days$type %in% c(TB,"n_logs"))  & (!is.na(days$cycleday_m_D)))
  if(length(j)>0){
    days_selected_cycles = rbind(days_selected_cycles, days[j,])
  }
}
## 1 
## Loading objects:
##   days
## 2 
## Loading objects:
##   days
## 3 
## Loading objects:
##   days
## 4 
## Loading objects:
##   days
## 5 
## Loading objects:
##   days
## 6 
## Loading objects:
##   days
## 7 
## Loading objects:
##   days
## 8 
## Loading objects:
##   days
## 9 
## Loading objects:
##   days
## 10 
## Loading objects:
##   days
## 11 
## Loading objects:
##   days
## 12 
## Loading objects:
##   days
## 13 
## Loading objects:
##   days
## 14 
## Loading objects:
##   days
## 15 
## Loading objects:
##   days
## 16 
## Loading objects:
##   days
## 17 
## Loading objects:
##   days
## 18 
## Loading objects:
##   days
## 19 
## Loading objects:
##   days
## 20 
## Loading objects:
##   days
## 21 
## Loading objects:
##   days
## 22 
## Loading objects:
##   days
## 23 
## Loading objects:
##   days
## 24 
## Loading objects:
##   days
## 25 
## Loading objects:
##   days
## 26 
## Loading objects:
##   days
## 27 
## Loading objects:
##   days
## 28 
## Loading objects:
##   days
## 29 
## Loading objects:
##   days
## 30 
## Loading objects:
##   days
## 31 
## Loading objects:
##   days
## 32 
## Loading objects:
##   days
## 33 
## Loading objects:
##   days
## 34 
## Loading objects:
##   days
## 35 
## Loading objects:
##   days
## 36 
## Loading objects:
##   days
## 37 
## Loading objects:
##   days
## 38 
## Loading objects:
##   days
## 39 
## Loading objects:
##   days
## 40 
## Loading objects:
##   days
## 41 
## Loading objects:
##   days
## 42 
## Loading objects:
##   days
## 43 
## Loading objects:
##   days
## 44 
## Loading objects:
##   days
## 45 
## Loading objects:
##   days
## 46 
## Loading objects:
##   days
## 47 
## Loading objects:
##   days
## 48 
## Loading objects:
##   days
## 49 
## Loading objects:
##   days
## 50 
## Loading objects:
##   days
## 51 
## Loading objects:
##   days
## 52 
## Loading objects:
##   days
## 53 
## Loading objects:
##   days
## 54 
## Loading objects:
##   days
## 55 
## Loading objects:
##   days
## 56 
## Loading objects:
##   days
## 57 
## Loading objects:
##   days
## 58 
## Loading objects:
##   days
## 59 
## Loading objects:
##   days
## 60 
## Loading objects:
##   days
## 61 
## Loading objects:
##   days
## 62 
## Loading objects:
##   days
## 63 
## Loading objects:
##   days
## 64 
## Loading objects:
##   days
## 65 
## Loading objects:
##   days
## 66 
## Loading objects:
##   days
## 67 
## Loading objects:
##   days
## 68 
## Loading objects:
##   days
## 69 
## Loading objects:
##   days
## 70 
## Loading objects:
##   days
## 71 
## Loading objects:
##   days
## 72 
## Loading objects:
##   days
## 73 
## Loading objects:
##   days
## 74 
## Loading objects:
##   days
## 75 
## Loading objects:
##   days
## 76 
## Loading objects:
##   days
## 77 
## Loading objects:
##   days
## 78 
## Loading objects:
##   days
## 79 
## Loading objects:
##   days
## 80 
## Loading objects:
##   days
## 81 
## Loading objects:
##   days
## 82 
## Loading objects:
##   days
## 83 
## Loading objects:
##   days
## 84 
## Loading objects:
##   days
## 85 
## Loading objects:
##   days
## 86 
## Loading objects:
##   days
## 87 
## Loading objects:
##   days
## 88 
## Loading objects:
##   days
## 89 
## Loading objects:
##   days
## 90 
## Loading objects:
##   days
## 91 
## Loading objects:
##   days
## 92 
## Loading objects:
##   days
## 93 
## Loading objects:
##   days
## 94 
## Loading objects:
##   days
## 95 
## Loading objects:
##   days
## 96 
## Loading objects:
##   days
## 97 
## Loading objects:
##   days
## 98 
## Loading objects:
##   days
## 99 
## Loading objects:
##   days
## 100 
## Loading objects:
##   days
## 101 
## Loading objects:
##   days
## 102 
## Loading objects:
##   days
## 103 
## Loading objects:
##   days
## 104 
## Loading objects:
##   days
## 105 
## Loading objects:
##   days
## 106 
## Loading objects:
##   days
## 107 
## Loading objects:
##   days
## 108 
## Loading objects:
##   days
## 109 
## Loading objects:
##   days
## 110 
## Loading objects:
##   days
## 111 
## Loading objects:
##   days
## 112 
## Loading objects:
##   days
## 113 
## Loading objects:
##   days
## 114 
## Loading objects:
##   days
## 115 
## Loading objects:
##   days
## 116 
## Loading objects:
##   days
## 117 
## Loading objects:
##   days
## 118 
## Loading objects:
##   days
## 119 
## Loading objects:
##   days
## 120 
## Loading objects:
##   days
## 121 
## Loading objects:
##   days
## 122 
## Loading objects:
##   days
## 123 
## Loading objects:
##   days
## 124 
## Loading objects:
##   days
## 125 
## Loading objects:
##   days
## 126 
## Loading objects:
##   days
## 127 
## Loading objects:
##   days
## 128 
## Loading objects:
##   days
## 129 
## Loading objects:
##   days
## 130 
## Loading objects:
##   days
## 131 
## Loading objects:
##   days
## 132 
## Loading objects:
##   days
## 133 
## Loading objects:
##   days
## 134 
## Loading objects:
##   days
## 135 
## Loading objects:
##   days
## 136 
## Loading objects:
##   days
## 137 
## Loading objects:
##   days
## 138 
## Loading objects:
##   days
## 139 
## Loading objects:
##   days
## 140 
## Loading objects:
##   days
## 141 
## Loading objects:
##   days
## 142 
## Loading objects:
##   days
## 143 
## Loading objects:
##   days
## 144 
## Loading objects:
##   days
## 145 
## Loading objects:
##   days
## 146 
## Loading objects:
##   days
## 147 
## Loading objects:
##   days
## 148 
## Loading objects:
##   days
## 149 
## Loading objects:
##   days
## 150 
## Loading objects:
##   days
## 151 
## Loading objects:
##   days
## 152 
## Loading objects:
##   days
## 153 
## Loading objects:
##   days
## 154 
## Loading objects:
##   days
## 155 
## Loading objects:
##   days
## 156 
## Loading objects:
##   days
## 157 
## Loading objects:
##   days
## 158 
## Loading objects:
##   days
## 159 
## Loading objects:
##   days
## 160 
## Loading objects:
##   days
## 161 
## Loading objects:
##   days
## 162 
## Loading objects:
##   days
## 163 
## Loading objects:
##   days
## 164 
## Loading objects:
##   days
## 165 
## Loading objects:
##   days
## 166 
## Loading objects:
##   days
## 167 
## Loading objects:
##   days
## 168 
## Loading objects:
##   days
## 169 
## Loading objects:
##   days
## 170 
## Loading objects:
##   days
## 171 
## Loading objects:
##   days
## 172 
## Loading objects:
##   days
## 173 
## Loading objects:
##   days
## 174 
## Loading objects:
##   days
## 175 
## Loading objects:
##   days
## 176 
## Loading objects:
##   days
## 177 
## Loading objects:
##   days
## 178 
## Loading objects:
##   days
## 179 
## Loading objects:
##   days
## 180 
## Loading objects:
##   days
## 181 
## Loading objects:
##   days
## 182 
## Loading objects:
##   days
## 183 
## Loading objects:
##   days
## 184 
## Loading objects:
##   days
## 185 
## Loading objects:
##   days
## 186 
## Loading objects:
##   days
## 187 
## Loading objects:
##   days
## 188 
## Loading objects:
##   days
## 189 
## Loading objects:
##   days
## 190 
## Loading objects:
##   days
## 191 
## Loading objects:
##   days
## 192 
## Loading objects:
##   days
## 193 
## Loading objects:
##   days
## 194 
## Loading objects:
##   days
## 195 
## Loading objects:
##   days
## 196 
## Loading objects:
##   days
## 197 
## Loading objects:
##   days
## 198 
## Loading objects:
##   days
## 199 
## Loading objects:
##   days
## 200 
## Loading objects:
##   days
## 201 
## Loading objects:
##   days
## 202 
## Loading objects:
##   days
## 203 
## Loading objects:
##   days
## 204 
## Loading objects:
##   days
## 205 
## Loading objects:
##   days
## 206 
## Loading objects:
##   days
## 207 
## Loading objects:
##   days
## 208 
## Loading objects:
##   days
## 209 
## Loading objects:
##   days
## 210 
## Loading objects:
##   days
## 211 
## Loading objects:
##   days
## 212 
## Loading objects:
##   days
## 213 
## Loading objects:
##   days
## 214 
## Loading objects:
##   days
## 215 
## Loading objects:
##   days
## 216 
## Loading objects:
##   days
## 217 
## Loading objects:
##   days
## 218 
## Loading objects:
##   days
## 219 
## Loading objects:
##   days
## 220 
## Loading objects:
##   days
## 221 
## Loading objects:
##   days
## 222 
## Loading objects:
##   days
## 223 
## Loading objects:
##   days
## 224 
## Loading objects:
##   days
## 225 
## Loading objects:
##   days
## 226 
## Loading objects:
##   days
## 227 
## Loading objects:
##   days
## 228 
## Loading objects:
##   days
## 229 
## Loading objects:
##   days
## 230 
## Loading objects:
##   days
toc()
## 621.417 sec elapsed
days = days_selected_cycles
save(days, file = paste0(IO$tmp_data,"days_selected_cycles.Rdata"))

Transform these data in wide format

# load( file = paste0(IO$tmp_data,"days_selected_cycles.Rdata"), verbose = TRUE)
# dim(days)

d_wide = reshape_and_impute(days)
d_wide_with_tracking = d_wide 
d_wide[d_wide == -1] = 0


m = match(d_wide$cycle_id_m, cycles_m$cycle_id_m)
d_wide$stretch_id = cycles_m$stretch_id[m]
d_wide$cycle_nb_m = cycles_m$cycle_nb_m[m]

write_feather(d_wide, path = paste0(IO$tmp_data, "d_wide_without_tracking_for_cvdt.feather"))

1.2.2 Computing the indicators for the selected data

tic()
average_profiles_with_cvdt =  data.frame()
for(stretch_id in unique(d_wide$stretch_id)[1:500]){
  j = which(d_wide$stretch_id == stretch_id)
  XX = as.matrix(d_wide[j,grep("n\\.",colnames(d_wide))])
  average_profiles_this_stretch = data.frame()
  for(i in 1:(nrow(XX)-n_consecutive_cycles+1)){
    X = XX[i:(i+n_consecutive_cycles-1),]
    average_profiles_this_stretch = rbind(average_profiles_this_stretch,
                                          data.frame(stretch_id = stretch_id, 
                                                     cycle_nb = d_wide$cycle_nb_m[j[1]+i-1], 
                                                     duration = cvdt_duration(X), 
                                                     timing = cvdt_timing(X), 
                                                     variability = cvdt_variability(X), 
                                                     cyclicity  = cvdt_cyclicity(X),
                                                     cycleday_m_D = -par$D:par$Df,
                                                     average_profile = apply(X,2,mean))
    )
  }
  average_profiles_with_cvdt = rbind(average_profiles_with_cvdt,
                                     average_profiles_this_stretch)
}
toc()
## 14.137 sec elapsed
average_profiles_with_cvdt$BC = cycles_m$BC[match(average_profiles_with_cvdt$stretch_id,  cycles_m$stretch_id)]
columns_for_unique = c("stretch_id","cycle_nb","duration","timing","variability","cyclicity","BC")
cvdt_stretches = unique(average_profiles_with_cvdt[,columns_for_unique])
cvdt_stretches$user_id = cycles_m$user_id[match(cvdt_stretches$stretch_id, cycles_m$stretch_id)]
cvdt_stretches$cycle_id_m = paste0(cvdt_stretches$user_id, "_", cvdt_stretches$cycle_nb)


write_feather(average_profiles_with_cvdt, path = paste0(IO$tmp_data, "average_profiles_with_cvdt.feather"))
write_feather(cvdt_stretches, path = paste0(IO$tmp_data, "cvdt_stretches.feather"))

1.2.3 Visualisations

ggplot(cvdt_stretches, aes(x = cyclicity, fill = BC))+
  geom_histogram(aes(y = ..density..),binwidth = 0.05, alpha = 0.5, position = "identity")+
  scale_fill_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(x = variability, fill = BC))+ 
  geom_histogram(aes(y = ..density..),binwidth = 0.5, alpha = 0.5, position = "identity")+
  scale_fill_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(x = duration, fill = BC))+ 
  geom_histogram(aes(y = ..density..),bins = 26, alpha = 0.5, position = "identity")+
  scale_fill_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(x = timing, fill = BC))+
  geom_histogram(aes(y = ..density..),bins = 26, alpha = 0.5, position = "identity")+
  scale_fill_manual(values = cols$BC)
## Warning: Removed 12 rows containing non-finite values (stat_bin).

ggplot(cvdt_stretches, aes(y =  cyclicity, x = duration, col = BC))+
  geom_point(alpha = 0.3)+
  scale_color_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(y = variability, x = cyclicity, col = BC))+
  geom_point(alpha = 0.3)+
  scale_color_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(y = variability, x = duration, col = BC))+
  geom_point(alpha = 0.3)+
  scale_color_manual(values = cols$BC)

ggplot(cvdt_stretches, aes(y = timing, x = cyclicity, col = BC))+
  geom_point(alpha = 0.3)+
  scale_color_manual(values = cols$BC)
## Warning: Removed 12 rows containing missing values (geom_point).

plot_ly(cvdt_stretches, x = ~cyclicity, y = ~duration, z = ~timing,
        color = ~variability,
        type = "scatter3d", mode = "markers",
        marker = list(size = 2))
## Warning: Ignoring 12 observations
j = which(!is.na(cvdt_stretches$timing))
cvdt_col = c("duration","timing","variability","cyclicity")

cor(cvdt_stretches[j,cvdt_col ])
##                duration     timing variability   cyclicity
## duration     1.00000000 -0.2234107   0.5761576 -0.06540099
## timing      -0.22341070  1.0000000  -0.2488482  0.36553358
## variability  0.57615757 -0.2488482   1.0000000 -0.13315037
## cyclicity   -0.06540099  0.3655336  -0.1331504  1.00000000
pca = prcomp(cvdt_stretches[j,cvdt_col],
             scale = TRUE, center = TRUE)

ggplot(data.frame(dim = 1:4, sdev = pca$sdev), aes(x = dim, y = sdev))+
  geom_bar(stat = "identity")

pca_points = as.data.frame(pca$x)

pca_points$BC = cvdt_stretches$BC[j]
pca_points$duration = cvdt_stretches$duration[j]
pca_points$timing = cvdt_stretches$timing[j]

pca_vec = as.data.frame(pca$rotation)
pca_vec$attr = rownames(pca_vec)


S = 5
ggplot(pca_points, aes(x = PC1, y = PC2))+ coord_fixed()+
  geom_point( alpha = 0.3, size = 0.5)+
  geom_segment(data = pca_vec, aes(x = 0, y = 0, xend = S*PC1, yend = S*PC2, col = attr), size = 1)+
  facet_grid(BC ~ .)

plot_ly(pca_points, x = ~PC1, y = ~PC2, z = ~PC3,
        color = ~BC,
        type = "scatter3d", mode = "markers",
        marker = list(size = 2))
o = order(cvdt_stretches$cyclicity, decreasing = TRUE)
cvdt_stretches = cvdt_stretches[o,]
cvdt_stretches = cvdt_stretches[sample(1:nrow(cvdt_stretches)),]

ggplot(cvdt_stretches[cvdt_stretches$BC %in% par$BC_dict$name,], aes(x = timing, y = duration,  col = cyclicity))+ # alpha = variability,
  geom_vline(xintercept = 0, col = "gray80", size = 2)+
  geom_hline(yintercept = 0, col = "gray40")+
  geom_segment(aes(x = timing - duration/2, xend = timing + duration/2, yend = duration, alpha = cyclicity), size = 1.2)+
  geom_point(aes(size = variability), alpha = 0.7)+
  scale_color_gradient(low = "gray80", high = "mediumslateblue")+
  scale_x_continuous(breaks = par$x.axis)+
  scale_alpha(range = c(0,0.2))+
  facet_grid(BC ~ .)
## Warning: Removed 12 rows containing missing values (geom_segment).
## Warning: Removed 12 rows containing missing values (geom_point).

o = order(average_profiles_with_cvdt$cyclicity, average_profiles_with_cvdt$duration, decreasing = TRUE)
average_profiles_with_cvdt = average_profiles_with_cvdt[o,]

ggplot(average_profiles_with_cvdt, aes(x = cycleday_m_D, y = stretch_id,  col = BC, alpha = average_profile))+ # alpha = variability,
  geom_vline(xintercept = 0, col = "gray80", size = 2)+
  geom_point()+
  scale_color_manual(values = cols$BC)+
  scale_x_continuous(breaks = par$x.axis)+
  scale_alpha(range = c(0,1))+
  facet_grid( . ~ BC , scale = "free_y")

duration_4 = (round(cvdt_stretches$duration) %in% 3:5)
duration_20 = (round(cvdt_stretches$duration) %in% 19:21)
cyclicity_high = (round(cvdt_stretches$cyclicity,1) >= quantile(cvdt_stretches$cyclicity,p = 0.95))
cyclicity_0 = (round(cvdt_stretches$cyclicity,2) == 0)
cyclicity_low = (round(cvdt_stretches$cyclicity,1) <= 0.2)
cyclicity_mid = (round(cvdt_stretches$cyclicity,1) == 0.5)
variability_high = (round(cvdt_stretches$variability,2) >= quantile(cvdt_stretches$variability,p = 0.95))
variability_mid = (round(cvdt_stretches$variability,2) %in% 
                     seq(quantile(round(cvdt_stretches$variability,2),p = 0.30),quantile(round(cvdt_stretches$variability,2),p = 0.80),by = 0.01))
variability_low = (cvdt_stretches$variability <= quantile(cvdt_stretches$variability,p = 0.20))
timing_3 = (round(cvdt_stretches$timing) %in% -3:-2)
timing_7 = (round(cvdt_stretches$timing) %in% -8:-7)
timing_0 = (round(cvdt_stretches$timing) %in% -1:0)

ex =       which(duration_4 & cyclicity_low & variability_mid)[1]
ex = c(ex, which(            cyclicity_mid & variability_mid & timing_3)[1])
ex = c(ex, which(duration_4 & cyclicity_high & variability_high)[1])
ex = c(ex, which(duration_4 & cyclicity_mid & variability_mid  & timing_0)[1])
ex = c(ex, which(             cyclicity_high & variability_mid)[10])
ex = c(ex, which(duration_20 & cyclicity_mid)[1])
ex = c(ex, which(duration_20 & cyclicity_low)[1])
ex = c(ex, which(!is.na(match(cvdt_stretches$stretch_id, cycles_m$stretch_id[which(cycles_m$user_id == special_pill_user)])))[1])

if(any(is.na(ex))){ex = ex[-which(is.na(ex))]}
length(ex)
## [1] 6
stretch_id_ex = cvdt_stretches$stretch_id[ex]
cycle_ids_ex = paste0(rep(cvdt_stretches$user_id[ex],each = 6),"_",rep(cvdt_stretches$cycle_nb[ex], each = 6) + rep(0:5, length(ex)) ) 

save(cycle_ids_ex, file = paste0(IO$tmp_data, "cycle_ids_ex_v3.Rdata"))

k = which(d_wide_with_tracking$cycle_id_m %in% cycle_ids_ex )

d = melt(d_wide_with_tracking[k,], id.vars = c("user_id","cycle_id_m"))
colnames(d)[match(c("variable","value"),colnames(d))] = c("cycleday_m_D","tender_breasts")
d$cycleday_m_D = as.numeric(gsub("\\.","-",gsub("n\\.","",d$cycleday_m_D)))
d$cycle_nb_m = cycles_m$cycle_nb_m[match(d$cycle_id_m, cycles_m$cycle_id_m)]
d$user_id = factor(d$user_id,levels = cvdt_stretches$user_id[ex])
d$stretch_id = cvdt_stretches$stretch_id[match(  d$cycle_id_m, cvdt_stretches$cycle_id_m)]
#d$user_id = factor(d$user_id,levels = unique(d$user_id))

ggplot_imputed_TB(sel_d = d, facet_grid_y = "user_id")

average_profiles_with_cvdt$s_id = paste0(average_profiles_with_cvdt$stretch_id,"_",average_profiles_with_cvdt$cycle_nb)
k = which(average_profiles_with_cvdt$s_id %in% paste0(cvdt_stretches$stretch_id[ex],"_", cvdt_stretches$cycle_nb[ex]))

selected_average_profiles_with_cvdt = average_profiles_with_cvdt[k,]
selected_average_profiles_with_cvdt$stretch_id = factor(selected_average_profiles_with_cvdt$stretch_id, levels = unique(cvdt_stretches$stretch_id[ex]))
ggplot(selected_average_profiles_with_cvdt,aes(x = cycleday_m_D, y = average_profile, col = stretch_id) )+
  geom_point()+
  geom_line()+
  guides(col = FALSE)+
  facet_grid(stretch_id ~ .)

data.frame(cvdt_stretches$stretch_id[ex],round(cvdt_stretches[ex,cvdt_col],2))
##                         cvdt_stretches.stretch_id.ex. duration timing
## n..185130 83120f9b11c4ee1a056a63485411a019ab04e83c_s1     4.59  -4.90
## n..181360 391aeeff3cd59b86a0c4f0f388779aafab48631e_s2     5.47  -3.11
## n..18876  511fce665cfb11f78a566b190e98e13864638ff8_s4     3.67  -0.45
## n..18874  43315b78b9ba4eeafd2c4f1a04e9d5441ac00a0f_s1     6.19  -3.00
## n..18657  9072ba048e0dce2ea3ed24b25ddfa23417aaacdd_s1    20.75  -3.77
## n..181611 d51e275e982845648f63ec06d0e16ef46466b800_s2    20.73  -5.47
##           variability cyclicity
## n..185130        2.77      0.00
## n..181360        3.50      0.47
## n..18876         2.56      0.53
## n..18874         3.45      0.90
## n..18657         4.43      0.47
## n..181611        3.67      0.00